df <- read_excel("wuhan_blood_sample_data_Jan_Feb_2020.xlsx")
patients_df <- df %>% group_by(`Admission time`, `Discharge time`, gender, age, outcome) %>%
summarise(PATIENT_ID = sum(PATIENT_ID, na.rm = TRUE), `Total blood tests` = n())
patients_df <- patients_df %>%
mutate(`Days in hospital` = ceiling(difftime(`Discharge time`, `Admission time`, units = "days")))
df <- full_join(patients_df %>% ungroup() %>% select(`Admission time`, PATIENT_ID), df %>% select(-PATIENT_ID), by="Admission time")
df$gender<-ifelse(df$gender==1, 'Male', 'Female')
df <- df %>% mutate(gender = as.factor(gender))
patients_df$gender<-ifelse(patients_df$gender==1, 'Male', 'Female')
patients_df <- patients_df %>% mutate(gender = as.factor(gender))
df$outcome<-ifelse(df$outcome==1, 'Death', 'Survival')
df <- df %>% mutate(outcome = as.factor(outcome))
patients_df$outcome<-ifelse(patients_df$outcome==1, 'Death', 'Survival')
patients_df <- patients_df %>% mutate(outcome = as.factor(outcome))
patients_summary <- patients_df %>% ungroup() %>% select(-c(age, PATIENT_ID))
tbl_summary(
patients_summary,
by = outcome,
label = gender ~ "Gender",
) %>%
add_n() %>%
modify_header(label = "") %>%
add_overall() %>%
bold_labels()
| Overall, N = 3751 | N | Survival, N = 2011 | Death, N = 1741 | |
|---|---|---|---|---|
| Gender | 375 | |||
| Male | 224 (60%) | 98 (49%) | 126 (72%) | |
| Female | 151 (40%) | 103 (51%) | 48 (28%) | |
| Total blood tests | 16 (9, 21) | 375 | 16 (12, 20) | 14 (7, 24) |
| Days in hospital | 10 (5, 16) | 375 | 14 (10, 18) | 6 (3, 10) |
|
1
Statistics presented: n (%); Median (IQR)
|
||||
ggplot(patients_df, aes(x=age,fill=gender)) + geom_histogram(binwidth = 1) + facet_grid(. ~ gender) + scale_x_continuous(name="Age", limits=c(min(df$age), max(df$age)), breaks = seq(0, 100, by=10)) + scale_y_continuous(name = "Number of patients", limits = c(0,10), breaks = seq(0,10, by=1)) +
theme_minimal()
ggplot(patients_df, aes(x=age,fill=outcome)) + geom_histogram(binwidth = 1) + facet_grid(outcome ~ gender) + scale_x_continuous(name="Age", limits=c(min(df$age), max(df$age)), breaks = seq(0, 100, by=10)) + scale_y_continuous(name = "Number of patients", limits = c(0,9), breaks = seq(0,9, by=1)) +
theme_minimal()
ggplot(patients_df, aes(x=`Days in hospital`, fill=outcome)) + geom_histogram(binwidth = 1) + facet_grid(. ~ outcome) + ylab("Number of patients") +
theme_minimal()
ggplot(patients_df, aes(x=`Total blood tests`, fill=outcome)) + geom_histogram(binwidth = 1) + facet_grid(. ~ outcome) + ylab("Number of patients") +
scale_x_continuous(name="Total blood tests", limits=c(0, 60), breaks = seq(0, 60, by=10)) +
scale_y_continuous(name = "Number of patients", limits = c(0,20), breaks = seq(0,20, by=2)) +
theme_minimal()
no_dates_df <- df %>% select(-c('Admission time', 'Discharge time', 'RE_DATE', 'PATIENT_ID', 'age', 'gender' ))
tbl_summary(
no_dates_df,
by = outcome,
missing = "no"
) %>%
add_n() %>%
modify_header(label = "**Biomarker**") %>%
modify_spanning_header(c("stat_1", "stat_2") ~ "**Final patient outcome related to the test**") %>%
bold_labels()
| Biomarker | N | Final patient outcome related to the test | |
|---|---|---|---|
| Death, N = 2,9051 | Survival, N = 3,2151 | ||
| Hypersensitive cardiac troponinI | 507 | 70 (18, 631) | 3 (2, 7) |
| hemoglobin | 975 | 123 (110, 135) | 127 (116, 138) |
| Serum chloride | 975 | 104 (100, 111) | 101 (99, 103) |
| Prothrombin time | 662 | 16.3 (15.0, 18.2) | 13.6 (13.1, 14.1) |
| procalcitonin | 459 | 0.38 (0.14, 1.13) | 0.04 (0.02, 0.06) |
| eosinophils(%) | 957 | 0.00 (0.00, 0.10) | 0.70 (0.00, 1.80) |
| Interleukin 2 receptor | 268 | 1,180 (807, 1,603) | 529 (400, 742) |
| Alkaline phosphatase | 930 | 83 (64, 123) | 60 (50, 75) |
| albumin | 934 | 28 (24, 31) | 36 (34, 39) |
| basophil(%) | 957 | 0.10 (0.10, 0.20) | 0.20 (0.10, 0.40) |
| Interleukin 10 | 267 | 11 (6, 17) | 5 (5, 8) |
| Total bilirubin | 930 | 14 (10, 25) | 8 (6, 12) |
| Platelet count | 957 | 112 (55, 174) | 229 (176, 290) |
| monocytes(%) | 958 | 3.0 (2.0, 4.7) | 8.2 (6.3, 10.0) |
| antithrombin | 330 | 80 (70, 92) | 93 (86, 103) |
| Interleukin 8 | 268 | 30 (18, 61) | 11 (7, 19) |
| indirect bilirubin | 906 | 6.2 (4.2, 9.2) | 4.9 (3.4, 7.1) |
| Red blood cell distribution width | 923 | 13.20 (12.40, 14.40) | 12.20 (11.80, 12.80) |
| neutrophils(%) | 957 | 92 (88, 95) | 66 (56, 76) |
| total protein | 931 | 62 (57, 68) | 68 (65, 72) |
| Quantification of Treponema pallidum antibodies | 279 | 0.06 (0.04, 0.07) | 0.05 (0.04, 0.07) |
| Prothrombin activity | 659 | 66 (56, 78) | 94 (88, 103) |
| HBsAg | 279 | 0.01 (0.00, 0.02) | 0.00 (0.00, 0.01) |
| mean corpuscular volume | 957 | 91.3 (87.1, 96.4) | 89.8 (86.8, 91.9) |
| hematocrit | 957 | 35.9 (32.5, 39.8) | 37.1 (34.3, 39.9) |
| White blood cell count | 1,127 | 12 (8, 17) | 6 (4, 8) |
| Tumor necrosis factorα | 268 | 11 (8, 17) | 8 (6, 10) |
| mean corpuscular hemoglobin concentration | 957 | 342 (331, 350) | 343 (335, 350) |
| fibrinogen | 566 | 3.92 (2.44, 5.63) | 4.40 (3.56, 5.34) |
| Interleukin 1β | 268 | 5.0 (5.0, 5.0) | 5.0 (5.0, 5.0) |
| Urea | 936 | 11 (7, 17) | 4 (3, 5) |
| lymphocyte count | 957 | 0.46 (0.31, 0.69) | 1.25 (0.87, 1.62) |
| PH value | 384 | 6.50 (6.00, 7.41) | 6.50 (6.00, 7.00) |
| Red blood cell count | 1,127 | 4.0 (3.6, 4.6) | 4.2 (3.8, 4.7) |
| Eosinophil count | 957 | 0.00 (0.00, 0.01) | 0.03 (0.00, 0.09) |
| Corrected calcium | 914 | 2.35 (2.27, 2.44) | 2.37 (2.27, 2.44) |
| Serum potassium | 980 | 4.60 (4.04, 5.27) | 4.28 (3.92, 4.62) |
| glucose | 775 | 9.1 (6.9, 13.3) | 5.7 (5.0, 7.6) |
| neutrophils count | 957 | 10.8 (7.0, 15.2) | 3.5 (2.4, 5.2) |
| Direct bilirubin | 930 | 8 (5, 14) | 4 (2, 5) |
| Mean platelet volume | 862 | 11.30 (10.70, 12.20) | 10.40 (9.90, 11.00) |
| ferritin | 283 | 1,636 (928, 2,517) | 504 (235, 834) |
| RBC distribution width SD | 923 | 43.7 (39.9, 48.5) | 39.5 (37.6, 41.4) |
| Thrombin time | 566 | 17.30 (15.80, 19.75) | 16.40 (15.60, 17.30) |
| (%)lymphocyte | 958 | 4 (2, 7) | 24 (16, 33) |
| HCV antibody quantification | 279 | 0.07 (0.04, 0.11) | 0.06 (0.04, 0.08) |
| D-D dimer | 630 | 19 (3, 21) | 1 (0, 1) |
| Total cholesterol | 931 | 3.32 (2.72, 3.88) | 3.93 (3.39, 4.48) |
| aspartate aminotransferase | 935 | 38 (25, 59) | 21 (17, 29) |
| Uric acid | 934 | 245 (166, 374) | 240 (193, 304) |
| HCO3- | 934 | 21.8 (18.8, 24.7) | 24.7 (22.8, 26.7) |
| calcium | 979 | 2.00 (1.90, 2.08) | 2.17 (2.10, 2.25) |
| Amino-terminal brain natriuretic peptide precursor(NT-proBNP) | 475 | 1,467 (516, 4,578) | 64 (23, 166) |
| Lactate dehydrogenase | 934 | 593 (431, 840) | 220 (189, 278) |
| platelet large cell ratio | 862 | 35 (30, 42) | 28 (23, 33) |
| Interleukin 6 | 272 | 66 (30, 142) | 8 (2, 21) |
| Fibrin degradation products | 330 | 114 (18, 150) | 4 (4, 4) |
| monocytes count | 957 | 0.36 (0.20, 0.58) | 0.43 (0.32, 0.58) |
| PLT distribution width | 862 | 13.60 (12.10, 15.93) | 11.70 (10.70, 13.00) |
| globulin | 930 | 34.1 (30.2, 38.2) | 31.8 (29.5, 35.2) |
| γ-glutamyl transpeptidase | 930 | 42 (27, 79) | 29 (19, 46) |
| International standard ratio | 659 | 1.31 (1.17, 1.48) | 1.04 (0.99, 1.09) |
| basophil count(#) | 957 | 0.010 (0.010, 0.030) | 0.010 (0.010, 0.020) |
| 2019-nCoV nucleic acid detection | 501 | ||
| -1 | 57 (100%) | 444 (100%) | |
| mean corpuscular hemoglobin | 957 | 31.20 (29.90, 32.70) | 30.70 (29.60, 31.90) |
| Activation of partial thromboplastin time | 568 | 40 (36, 45) | 39 (35, 43) |
| High sensitivity C-reactive protein | 737 | 114 (65, 191) | 7 (2, 35) |
| HIV antibody quantification | 278 | 0.08 (0.07, 0.11) | 0.09 (0.08, 0.11) |
| serum sodium | 975 | 142 (138, 148) | 140 (138, 141) |
| thrombocytocrit | 862 | 0.15 (0.10, 0.21) | 0.24 (0.19, 0.30) |
| ESR | 383 | 36 (16, 59) | 26 (13, 40) |
| glutamic-pyruvic transaminase | 931 | 26 (18, 44) | 21 (15, 36) |
| eGFR | 936 | 72 (43, 91) | 100 (85, 114) |
| creatinine | 936 | 88 (68, 130) | 64 (54, 83) |
|
1
Statistics presented: Median (IQR); n (%)
|
|||
Finding correlation between outcome and other variables. Ignoring Admission time, PATIENT_ID, RE_DATE and Discharge time and dropping outcome which correlation with itself is equal to 1.0.
cor_df <-df
cor_df$outcome<-ifelse(cor_df$outcome=='Death', 1, 0)
cor_df$gender<-ifelse(cor_df$gender=='Male', 1, 0)
cor_df <- cor_df %>% rename(isMale=gender)
cor_df <- cor_df %>% rename(Death=outcome)
cor_df <- cor_df %>% select (-c(`Admission time`, PATIENT_ID, RE_DATE, `Discharge time` ))
corrMatrix <- cor(cor_df[sapply(cor_df, is.numeric)], use='pairwise.complete.obs')
correlation_df <- as.data.frame(corrMatrix)
correlation_df %>% rownames_to_column('variable') %>% filter(variable != 'Death') %>% select(variable, Death) %>% mutate(Death = abs(Death)) %>%
arrange(desc(Death)) %>%
rename(`Outcome correlation` = Death) %>%
head(10)
| variable | Outcome correlation |
|---|---|
| neutrophils(%) | 0.7258383 |
| (%)lymphocyte | 0.7230203 |
| albumin | 0.6892432 |
| Prothrombin activity | 0.6562405 |
| High sensitivity C-reactive protein | 0.6524696 |
| D-D dimer | 0.6368244 |
| Lactate dehydrogenase | 0.6230357 |
| neutrophils count | 0.6071749 |
| Fibrin degradation products | 0.6044110 |
| age | 0.5860651 |
In following section, we will use 3 biomarkers that are most correlated to outcome and visualize theirs mean values among each patient at 3D graph. So we take only these patients who had all of these 3 biomarkers tested at least once. If somebody was tested more than once, then mean value of all tests for each biomarker is calculated.
new_df <- df
new_df <- new_df %>% select(PATIENT_ID, `neutrophils(%)`, `(%)lymphocyte`, albumin, outcome) %>%
group_by(PATIENT_ID, outcome) %>%
summarise(Neutrophils_mean = mean(`neutrophils(%)`, na.rm = TRUE), Lymphocyte_mean = mean(`(%)lymphocyte`, na.rm = TRUE), albumin_mean = mean(`albumin`, na.rm = TRUE))
new_df <- new_df %>% filter(!is.nan(Neutrophils_mean) & !is.nan(Lymphocyte_mean) & !is.nan(albumin_mean))
mycolors <- c('royalblue1', 'darkcyan')
new_df$color <- mycolors[ as.numeric(new_df$outcome) ]
par(mar=c(0,0,0,0))
plot3d(
x=new_df$Neutrophils_mean, y=new_df$Lymphocyte_mean, z=new_df$albumin_mean,
col = new_df$color,
type = 's',
radius = 1,
legend=TRUE,
xlab="Neutrophils(%)", ylab="Lymphocyte(%)", zlab="Albumin")
legend3d("topright", legend = c('Death', 'Survival'), pch = 10, col = mycolors, cex=0.8, inset=c(0.02))
writeWebGL( filename="3d_correlation_mean.html" , width=600, height=600)
Data after cleaning and transformation:
tbl_summary(
new_df %>% ungroup() %>%rename(`Neutrophils(%)` = Neutrophils_mean, `Lymphocyte(%)` = Lymphocyte_mean, `Albumin`= albumin_mean) %>% select(-PATIENT_ID, -color),
by = outcome
) %>%
add_n() %>%
modify_header(label = "**Mean value of**") %>%
add_overall() %>%
bold_labels()
| Mean value of | Overall, N = 3541 | N | Death, N = 1621 | Survival, N = 1921 |
|---|---|---|---|---|
| Neutrophils(%) | 77 (63, 91) | 354 | 91 (87, 94) | 66 (56, 72) |
| Lymphocyte(%) | 15 (5, 26) | 354 | 5 (3, 8) | 25 (18, 31) |
| Albumin | 33.1 (29.1, 37.0) | 354 | 28.9 (26.3, 31.2) | 36.5 (33.9, 39.4) |
|
1
Statistics presented: Median (IQR)
|
||||
htmltools::includeHTML("./3d_correlation_mean.html")
You must enable Javascript to view this page properly.
biomarker_popularity <- df %>% group_by(PATIENT_ID, outcome) %>% summarise_at(vars(`Hypersensitive cardiac troponinI`:creatinine), function(x) sum(!is.na(x)))
biomarker_popularity$sum_of_single_test <- rowSums(biomarker_popularity %>% ungroup() %>% select(-PATIENT_ID, -outcome))
most_tested_death <- biomarker_popularity %>% select(sum_of_single_test, outcome, `neutrophils(%)`, `(%)lymphocyte`, albumin) %>% filter(outcome=='Death') %>% arrange(desc(sum_of_single_test)) %>% head(10)
most_tested_survival <- biomarker_popularity %>% select(sum_of_single_test, outcome, `neutrophils(%)`, `(%)lymphocyte`, albumin) %>% filter(outcome=='Survival') %>% arrange(desc(sum_of_single_test)) %>% head(10)
foo <- rbind(most_tested_death, most_tested_survival)
patients_tests <- merge(foo %>% ungroup() %>% select(PATIENT_ID), df %>% ungroup() %>% select(PATIENT_ID, RE_DATE,`neutrophils(%)`, `(%)lymphocyte`, albumin, outcome), by="PATIENT_ID") %>% filter(!is.na(`neutrophils(%)`) | !is.na(`(%)lymphocyte`) | !is.na(albumin))
neutrophils_seq <- patients_tests %>% filter(!is.na(`neutrophils(%)`))
neutrophils_seq %>% mutate(PATIENT_ID=as.factor(PATIENT_ID)) %>%
ggplot( aes(x=RE_DATE, y=`neutrophils(%)`, group=PATIENT_ID, color=PATIENT_ID)) +
geom_line() +
geom_point() +
facet_grid(rows=vars(outcome)) +
ggtitle("Neutrophils (%) during patient hospitalization") +
theme_ipsum() +
ylab("Neutrophils (%)") +
transition_reveal(RE_DATE)
#anim_save("neutrophils_seq.gif")
lymphocyte_seq <- patients_tests %>% filter(!is.na(`(%)lymphocyte`))
lymphocyte_seq %>% mutate(PATIENT_ID=as.factor(PATIENT_ID)) %>%
ggplot( aes(x=RE_DATE, y=`(%)lymphocyte`, group=PATIENT_ID, color=PATIENT_ID)) +
geom_line() +
geom_point() +
facet_grid(rows=vars(outcome)) +
ggtitle("Lymphocyte (%) during patient hospitalization") +
theme_ipsum() +
ylab("Lymphocyte (%)") +
transition_reveal(RE_DATE)
#anim_save("lymphocyte_seq.gif")
albumin_seq <- patients_tests %>% filter(!is.na(albumin))
albumin_seq %>% mutate(PATIENT_ID=as.factor(PATIENT_ID)) %>%
ggplot( aes(x=RE_DATE, y=albumin, group=PATIENT_ID, color=PATIENT_ID)) +
geom_line() +
geom_point() +
facet_grid(rows=vars(outcome)) +
ggtitle("Albumin value during patient hospitalization") +
theme_ipsum() +
ylab("Albumin") +
transition_reveal(RE_DATE)
#anim_save("albumin_seq.gif")
#Get last test of each patient for crucial biomarkers
#df %>% select(PATIENT_ID, RE_DATE, age, gender, `neutrophils(%)`, `(%)lymphocyte`, albumin, outcome) %>%
# group_by(PATIENT_ID, age, gender, outcome) %>%
# fill(`neutrophils(%)`, `(%)lymphocyte`, albumin) %>%
# summarise_at(vars(`neutrophils(%)`:albumin), function(x) last(x,order_by = !is.na(x))) %>%
# filter(!is.na(`neutrophils(%)`) & !is.na(`(%)lymphocyte`) & !is.na(albumin))